import jieba
import re
import random

def preprocess_file(INPUT_FILE_PATH, OUTPUT_FILE_PATH):
    """Segment a Chinese text file line by line and write labeled output.

    For each non-blank input line: strip every non-Chinese character,
    segment the remainder with jieba, then write the space-joined words
    followed by a tab and a random label in [0, 3].

    Args:
        INPUT_FILE_PATH: path to the UTF-8 input text file.
        OUTPUT_FILE_PATH: path to the UTF-8 output file (overwritten).
    """
    # `with` guarantees both handles are closed even on error
    # (the original opened them and never closed either one).
    with open(INPUT_FILE_PATH, 'r', encoding='utf-8') as input_file, \
         open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            # Skip blank/whitespace-only lines (consistent with preprocess()).
            if line.strip() == '':
                continue

            # Keep only CJK unified ideographs (U+4E00..U+9FA5).
            line = re.sub(r'[^\u4e00-\u9fa5]+', '', line)
            word_list = jieba.lcut(line)  # segment the line into words

            # ' '.join replaces the manual first-word loop and avoids
            # quadratic string concatenation.
            segmented = ' '.join(word_list)

            rand_tag = random.randint(0, 3)  # random label for this line

            output_file.write(segmented + '\t' + str(rand_tag) + '\n')

def preprocess(sentences, OUTPUT_FILE_PATH):
    """Segment newline-separated Chinese sentences and write labeled output.

    For each non-blank line in `sentences`: strip every non-Chinese
    character, segment the remainder with jieba, then write the
    space-joined words followed by a tab and a random label in [0, 3].

    Args:
        sentences: a string of sentences separated by '\n'.
        OUTPUT_FILE_PATH: path to the UTF-8 output file (overwritten).
    """
    # `with` guarantees the handle is closed even on error
    # (the original opened it and never closed it).
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as output_file:
        for line in sentences.split('\n'):
            # Skip blank/whitespace-only lines.
            if line.strip() == '':
                continue

            # Keep only CJK unified ideographs (U+4E00..U+9FA5).
            line = re.sub(r'[^\u4e00-\u9fa5]+', '', line)
            word_list = jieba.lcut(line)  # segment the line into words

            # ' '.join replaces the manual first-word loop and avoids
            # quadratic string concatenation.
            segmented = ' '.join(word_list)

            rand_tag = random.randint(0, 3)  # random label for this line

            output_file.write(segmented + '\t' + str(rand_tag) + '\n')

if __name__ == '__main__':
    # Demo run: fixed input/output paths for the file-based pipeline.
    input_path = "./input.txt"
    output_path = "./THUCNews/data/testone.txt"
    preprocess_file(input_path, output_path)